{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Chi-Squared For Feature Selection"
      ],
      "metadata": {
        "nteract": {
          "transient": {
            "deleting": false
          }
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "import yfinance as yf\n",
        "yf.pdr_override()"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:39.209Z",
          "iopub.execute_input": "2020-06-28T18:16:39.213Z",
          "shell.execute_reply": "2020-06-28T18:16:40.159Z",
          "iopub.status.idle": "2020-06-28T18:16:40.130Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# input\n",
        "symbol = 'AMD'\n",
        "start = '2014-01-01'\n",
        "end = '2020-01-01'\n",
        "\n",
        "# Read data \n",
        "dataset = yf.download(symbol,start,end)\n",
        "\n",
        "# View Columns\n",
        "dataset.head()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[*********************100%***********************]  1 of 1 completed\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 2,
          "data": {
            "text/plain": [
              "            Adj Close  Close  High   Low  Open    Volume\n",
              "Date                                                    \n",
              "2014-01-02       3.95   3.95  3.98  3.84  3.85  20548400\n",
              "2014-01-03       4.00   4.00  4.00  3.88  3.98  22887200\n",
              "2014-01-06       4.13   4.13  4.18  3.99  4.01  42398300\n",
              "2014-01-07       4.18   4.18  4.25  4.11  4.19  42932100\n",
              "2014-01-08       4.18   4.18  4.26  4.14  4.23  30678700"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Close</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Open</th>\n",
              "      <th>Volume</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Date</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2014-01-02</th>\n",
              "      <td>3.95</td>\n",
              "      <td>3.95</td>\n",
              "      <td>3.98</td>\n",
              "      <td>3.84</td>\n",
              "      <td>3.85</td>\n",
              "      <td>20548400</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-03</th>\n",
              "      <td>4.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>3.88</td>\n",
              "      <td>3.98</td>\n",
              "      <td>22887200</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-06</th>\n",
              "      <td>4.13</td>\n",
              "      <td>4.13</td>\n",
              "      <td>4.18</td>\n",
              "      <td>3.99</td>\n",
              "      <td>4.01</td>\n",
              "      <td>42398300</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-07</th>\n",
              "      <td>4.18</td>\n",
              "      <td>4.18</td>\n",
              "      <td>4.25</td>\n",
              "      <td>4.11</td>\n",
              "      <td>4.19</td>\n",
              "      <td>42932100</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-08</th>\n",
              "      <td>4.18</td>\n",
              "      <td>4.18</td>\n",
              "      <td>4.26</td>\n",
              "      <td>4.14</td>\n",
              "      <td>4.23</td>\n",
              "      <td>30678700</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 2,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:40.142Z",
          "iopub.execute_input": "2020-06-28T18:16:40.149Z",
          "iopub.status.idle": "2020-06-28T18:16:41.356Z",
          "shell.execute_reply": "2020-06-28T18:16:41.500Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)\n",
        "dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)\n",
        "dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)\n",
        "dataset['Returns'] = dataset['Adj Close'].pct_change()\n",
        "dataset = dataset.dropna()"
      ],
      "outputs": [],
      "execution_count": 3,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.367Z",
          "iopub.execute_input": "2020-06-28T18:16:41.374Z",
          "iopub.status.idle": "2020-06-28T18:16:41.384Z",
          "shell.execute_reply": "2020-06-28T18:16:41.504Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 4,
          "data": {
            "text/plain": [
              "            Adj Close  Close  High   Low  Open    Volume  Increase_Decrease  \\\n",
              "Date                                                                          \n",
              "2014-01-03       4.00   4.00  4.00  3.88  3.98  22887200                  1   \n",
              "2014-01-06       4.13   4.13  4.18  3.99  4.01  42398300                  1   \n",
              "2014-01-07       4.18   4.18  4.25  4.11  4.19  42932100                  0   \n",
              "2014-01-08       4.18   4.18  4.26  4.14  4.23  30678700                  0   \n",
              "2014-01-09       4.09   4.09  4.23  4.05  4.20  30667600                  0   \n",
              "\n",
              "            Buy_Sell_on_Open  Buy_Sell   Returns  \n",
              "Date                                              \n",
              "2014-01-03                 1         1  0.012658  \n",
              "2014-01-06                 1         1  0.032500  \n",
              "2014-01-07                 1         0  0.012106  \n",
              "2014-01-08                 0         0  0.000000  \n",
              "2014-01-09                 0         1 -0.021531  "
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Close</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Open</th>\n",
              "      <th>Volume</th>\n",
              "      <th>Increase_Decrease</th>\n",
              "      <th>Buy_Sell_on_Open</th>\n",
              "      <th>Buy_Sell</th>\n",
              "      <th>Returns</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Date</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2014-01-03</th>\n",
              "      <td>4.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>4.00</td>\n",
              "      <td>3.88</td>\n",
              "      <td>3.98</td>\n",
              "      <td>22887200</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0.012658</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-06</th>\n",
              "      <td>4.13</td>\n",
              "      <td>4.13</td>\n",
              "      <td>4.18</td>\n",
              "      <td>3.99</td>\n",
              "      <td>4.01</td>\n",
              "      <td>42398300</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0.032500</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-07</th>\n",
              "      <td>4.18</td>\n",
              "      <td>4.18</td>\n",
              "      <td>4.25</td>\n",
              "      <td>4.11</td>\n",
              "      <td>4.19</td>\n",
              "      <td>42932100</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>0.012106</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-08</th>\n",
              "      <td>4.18</td>\n",
              "      <td>4.18</td>\n",
              "      <td>4.26</td>\n",
              "      <td>4.14</td>\n",
              "      <td>4.23</td>\n",
              "      <td>30678700</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2014-01-09</th>\n",
              "      <td>4.09</td>\n",
              "      <td>4.09</td>\n",
              "      <td>4.23</td>\n",
              "      <td>4.05</td>\n",
              "      <td>4.20</td>\n",
              "      <td>30667600</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>-0.021531</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 4,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.394Z",
          "iopub.execute_input": "2020-06-28T18:16:41.403Z",
          "iopub.status.idle": "2020-06-28T18:16:41.416Z",
          "shell.execute_reply": "2020-06-28T18:16:42.108Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset.shape"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 5,
          "data": {
            "text/plain": [
              "(1509, 10)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 5,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.425Z",
          "iopub.execute_input": "2020-06-28T18:16:41.434Z",
          "iopub.status.idle": "2020-06-28T18:16:41.447Z",
          "shell.execute_reply": "2020-06-28T18:16:42.111Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "X = dataset[['Open', 'High', 'Low','Adj Close', 'Volume', 'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]\n",
        "X = X.astype(int)\n",
        "y = dataset['Increase_Decrease']"
      ],
      "outputs": [],
      "execution_count": 6,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.456Z",
          "iopub.execute_input": "2020-06-28T18:16:41.462Z",
          "iopub.status.idle": "2020-06-28T18:16:41.471Z",
          "shell.execute_reply": "2020-06-28T18:16:42.114Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.feature_selection import SelectKBest\n",
        "from sklearn.feature_selection import chi2"
      ],
      "outputs": [],
      "execution_count": 7,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.478Z",
          "iopub.execute_input": "2020-06-28T18:16:41.483Z",
          "iopub.status.idle": "2020-06-28T18:16:41.968Z",
          "shell.execute_reply": "2020-06-28T18:16:42.117Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Select two features with highest chi-squared statistics\n",
        "chi2_selector = SelectKBest(chi2, k=2)\n",
        "X_kbest = chi2_selector.fit_transform(X, y)"
      ],
      "outputs": [],
      "execution_count": 8,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:41.979Z",
          "iopub.execute_input": "2020-06-28T18:16:41.987Z",
          "iopub.status.idle": "2020-06-28T18:16:41.999Z",
          "shell.execute_reply": "2020-06-28T18:16:42.119Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print('Original number of features:', X.shape[1])\n",
        "print('Reduced number of features:', X_kbest.shape[1])"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Original number of features: 8\n",
            "Reduced number of features: 2\n"
          ]
        }
      ],
      "execution_count": 9,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:42.008Z",
          "iopub.execute_input": "2020-06-28T18:16:42.012Z",
          "iopub.status.idle": "2020-06-28T18:16:42.024Z",
          "shell.execute_reply": "2020-06-28T18:16:42.123Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "chi2_selector.scores_"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 10,
          "data": {
            "text/plain": [
              "array([ 1.15639891e+00,  3.63046938e-01,  2.32221217e+00,  1.56309524e+00,\n",
              "       -1.17516984e+10,  1.81270501e-02,  2.02990321e+00,             nan])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 10,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:42.030Z",
          "iopub.execute_input": "2020-06-28T18:16:42.034Z",
          "iopub.status.idle": "2020-06-28T18:16:42.042Z",
          "shell.execute_reply": "2020-06-28T18:16:42.126Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "chi2_selector.pvalues_"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 11,
          "data": {
            "text/plain": [
              "array([0.28221363, 0.54681889, 0.12753853, 0.21121259, 1.        ,\n",
              "       0.8928991 , 0.15423043,        nan])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 11,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:42.048Z",
          "iopub.execute_input": "2020-06-28T18:16:42.052Z",
          "iopub.status.idle": "2020-06-28T18:16:42.062Z",
          "shell.execute_reply": "2020-06-28T18:16:42.129Z"
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "names = X.columns.values[chi2_selector.get_support()]\n",
        "scores = chi2_selector.scores_[chi2_selector.get_support()]\n",
        "names_scores = list(zip(names, scores))\n",
        "ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'Chi-Squared'])\n",
        "#Sort the dataframe for better visualization\n",
        "ns_df_sorted = ns_df.sort_values(['Chi-Squared', 'Feat_names'], ascending = [False, True])\n",
        "print(ns_df_sorted)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  Feat_names  Chi-Squared\n",
            "0        Low     2.322212\n",
            "1   Buy_Sell     2.029903\n"
          ]
        }
      ],
      "execution_count": 12,
      "metadata": {
        "collapsed": true,
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "execution": {
          "iopub.status.busy": "2020-06-28T18:16:42.068Z",
          "iopub.execute_input": "2020-06-28T18:16:42.076Z",
          "iopub.status.idle": "2020-06-28T18:16:42.085Z",
          "shell.execute_reply": "2020-06-28T18:16:42.134Z"
        }
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "python3"
    },
    "language_info": {
      "mimetype": "text/x-python",
      "version": "3.5.5",
      "pygments_lexer": "ipython3",
      "name": "python",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kernelspec": {
      "argv": [
        "C:\\Users\\Tin Hang\\Anaconda3\\envs\\py35\\python.exe",
        "-m",
        "ipykernel_launcher",
        "-f",
        "{connection_file}"
      ],
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "nteract": {
      "version": "0.23.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}